This notebook contains the analyses presented in our [recent preprint](https://www.biorxiv.org/content/10.1101/2022.02.23.481596v1).
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score
from structuremap.processing import perform_enrichment_analysis, perform_enrichment_analysis_per_protein
from structuremap.processing import annotate_proteins_with_idr_pattern, get_extended_flexible_pattern
from structuremap.processing import get_avg_3d_dist, get_avg_1d_dist, get_proximity_pvals, evaluate_ptm_colocalization
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization
import pandas as pd
import numpy as np
import os
import re
import plotly.express as px
import tqdm
from accessory_functions import *
Note: The pae_dir and cif_dir locations need to be adjusted.
pae_dir = '/Users/isabell/Downloads/alphafold_predicted_aligned_errors'
cif_dir = '/Users/isabell/Downloads/alphafold_cifs'
from pyteomics import fasta
# downloaded 30.09.2021
human_fasta = fasta.IndexedUniProt('data/human_fasta/uniprot-filtered-organism__Homo+sapiens+(Human)+[9606]_+AND+review--.fasta')
all_fasta_entries = []
for entry in human_fasta:
uniprot_id = entry.description['id']
all_fasta_entries.append(uniprot_id)
len(all_fasta_entries)
20386
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
proteins=all_fasta_entries,
out_folder=cif_dir)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20386/20386 [00:35<00:00, 568.39it/s]
2022-04-03 12:15:46> Valid proteins: 0 2022-04-03 12:15:46> Invalid proteins: 333 2022-04-03 12:15:46> Existing proteins: 20053
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
proteins=all_fasta_entries,
out_folder=pae_dir,
)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20386/20386 [00:35<00:00, 579.28it/s]
2022-04-03 12:16:21> Valid proteins: 0 2022-04-03 12:16:21> Invalid proteins: 333 2022-04-03 12:16:21> Existing proteins: 20053
# Test if equal protein files available in cif and pae folder
test_identical_ids('/Users/isabell/Downloads/alphafold_cifs/',
'/Users/isabell/Downloads/alphafold_predicted_aligned_errors/')
Number of unique proteins with cif and pae file: 21494
all_proteins = valid_proteins_cif + existing_proteins_cif
all_proteins = list(set(all_proteins))
%%time
alphafold_annotation = format_alphafold_data(directory=cif_dir,
protein_ids=all_proteins)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21495/21495 [1:06:54<00:00, 5.35it/s]
CPU times: user 1h 4min 41s, sys: 1min 36s, total: 1h 6min 17s Wall time: 1h 7min 28s
# Timing:
# CPU times: user 1h 10min 3s, sys: 1min 39s, total: 1h 11min 43s
# Wall time: 1h 12min 52s
alphafold_annotation.to_csv('data/alphafold_data/alphafold_annotation.csv', index=False)
# alphafold_annotation = pd.read_csv('data/alphafold_data/alphafold_annotation.csv')
alphafold_annotation.columns
Index(['protein_id', 'protein_number', 'AA', 'position', 'quality',
'x_coord_c', 'x_coord_ca', 'x_coord_cb', 'x_coord_n', 'y_coord_c',
'y_coord_ca', 'y_coord_cb', 'y_coord_n', 'z_coord_c', 'z_coord_ca',
'z_coord_cb', 'z_coord_n', 'secondary_structure', 'structure_group',
'BEND', 'HELX', 'STRN', 'TURN', 'unstructured'],
dtype='object')
%%time
full_sphere_exposure = annotate_accessibility(
df=alphafold_annotation,
max_dist=24,
max_angle=180,
error_dir=pae_dir)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20053/20053 [12:08<00:00, 27.52it/s]
CPU times: user 9min 54s, sys: 1min 1s, total: 10min 55s Wall time: 12min 13s
full_sphere_exposure.to_csv('data/alphafold_data/full_sphere_exposure.csv', index=False)
# full_sphere_exposure = pd.read_csv('data/alphafold_data/full_sphere_exposure.csv')
alphafold_accessibility = alphafold_annotation.merge(
full_sphere_exposure, how='left', on=['protein_id','AA','position'])
%%time
half_sphere_exposure = annotate_accessibility(
df=alphafold_annotation,
max_dist=12,
max_angle=70,
error_dir=pae_dir)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20053/20053 [07:16<00:00, 45.99it/s]
CPU times: user 5min 15s, sys: 59 s, total: 6min 14s Wall time: 7min 20s
half_sphere_exposure.to_csv('data/alphafold_data/half_sphere_exposure.csv', index=False)
# half_sphere_exposure = pd.read_csv('data/alphafold_data/half_sphere_exposure.csv')
alphafold_accessibility = alphafold_accessibility.merge(
half_sphere_exposure, how='left', on=['protein_id','AA','position'])
alphafold_accessibility['high_acc_5'] = np.where(alphafold_accessibility.nAA_12_70_pae <= 5, 1, 0)
alphafold_accessibility['low_acc_5'] = np.where(alphafold_accessibility.nAA_12_70_pae > 5, 1, 0)
alphafold_accessibility_smooth = get_smooth_score(alphafold_accessibility,
np.array(['quality', 'nAA_24_180_pae']),
[10])
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20053/20053 [00:28<00:00, 693.05it/s]
alphafold_accessibility_smooth['IDR'] = np.where(alphafold_accessibility_smooth['nAA_24_180_pae_smooth10']<=34.27, 1, 0)
bincount = np.unique(alphafold_accessibility_smooth[alphafold_accessibility_smooth.IDR==0].nAA_12_70_pae.values, return_counts=True) #
bincount_df = pd.DataFrame({'pPSE':bincount[0],'count':bincount[1]})
bincount_df['cutoff'] = np.where(bincount_df.pPSE<=5, 'high exposure', 'low exposure')
bincount_df.to_csv('data/alphafold_data/pPSE_bincount_df.csv', index=False)
pPSE_cut = px.bar(bincount_df, x='pPSE',y='count', color='cutoff',
color_discrete_map={'high exposure':'rgb(177, 63, 100)',
'low exposure':'grey'},
template="simple_white",
width=500, height=300)
pPSE_cut = pPSE_cut.update_layout(legend=dict(
title='',
yanchor="top",
y=0.99,
xanchor="right",
x=0.99
))
config={'toImageButtonOptions': {'format': 'svg', 'filename':'pPAE_cutoff'}}
pPSE_cut.show(config=config)
alphafold_accessibility_smooth_pattern = annotate_proteins_with_idr_pattern(alphafold_accessibility_smooth,
min_structured_length = 80,
max_unstructured_length = 20)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20053/20053 [00:27<00:00, 735.02it/s]
alphafold_accessibility_smooth_pattern_ext = get_extended_flexible_pattern(
alphafold_accessibility_smooth_pattern,
['flexible_pattern'], [5])
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20053/20053 [00:20<00:00, 956.22it/s]
alphafold_accessibility_smooth_pattern_ext[0:3]
| protein_id | protein_number | AA | position | quality | x_coord_c | x_coord_ca | x_coord_cb | x_coord_n | y_coord_c | ... | unstructured | nAA_24_180_pae | nAA_12_70_pae | high_acc_5 | low_acc_5 | quality_smooth10 | nAA_24_180_pae_smooth10 | IDR | flexible_pattern | flexible_pattern_extended_5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A0A024RBG1 | 1 | M | 1 | 53.51 | 3.607 | 3.626 | 2.498 | 3.451 | -12.413 | ... | 1 | 12 | 0 | 1 | 0 | 71.773636 | 29.818182 | 1 | 0 | 0 |
| 1 | A0A024RBG1 | 1 | M | 2 | 62.17 | 3.375 | 4.515 | 5.852 | 4.549 | -10.070 | ... | 1 | 22 | 0 | 1 | 0 | 73.706667 | 32.500000 | 1 | 0 | 0 |
| 2 | A0A024RBG1 | 1 | K | 3 | 67.26 | 2.350 | 1.665 | 1.009 | 2.664 | -8.174 | ... | 1 | 29 | 0 | 1 | 0 | 75.262308 | 34.153846 | 1 | 0 | 0 |
3 rows × 33 columns
proteins_with_pattern = alphafold_accessibility_smooth_pattern_ext[alphafold_accessibility_smooth_pattern_ext.flexible_pattern==1].protein_id.unique()
textfile = open("data/short_idrs/proteins_with_pattern.txt", "w")
for element in proteins_with_pattern:
textfile.write(element + "\n")
all_proteins = alphafold_accessibility_smooth_pattern_ext.protein_id.unique()
textfile = open("data/short_idrs/all_proteins.txt", "w")
for element in all_proteins:
textfile.write(element + "\n")
textfile.close()
print(len(proteins_with_pattern))
2454
print(len(all_proteins))
20053
DAVID platform version 6.8
enrichment_david = pd.read_csv('data/short_idrs/pattern_enrichment.txt', sep='\t')
plot_enrichment_david(enrichment_david, count_threshold=15)
Here we read kinase substructures annotated in KinaseMD.
loop_info = pd.read_csv('data/kinase_substructures/kinasemd_substrucutres.tsv', sep='\t')
len(loop_info.uniprot_id.unique())
388
loop_list = extract_loop_annotation(loop_info)
alphafold_loops = alphafold_accessibility_smooth_pattern_ext.copy(deep=True)
for l in loop_list:
alphafold_loops = alphafold_loops.merge(l, how='left', on=['protein_id','position'])
alphafold_loops = alphafold_loops.fillna(0)
alphafold_loops[0:3]
| protein_id | protein_number | AA | position | quality | x_coord_c | x_coord_ca | x_coord_cb | x_coord_n | y_coord_c | ... | high_acc_5 | low_acc_5 | quality_smooth10 | nAA_24_180_pae_smooth10 | IDR | flexible_pattern | flexible_pattern_extended_5 | gloop | aloop | achelix | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A0A024RBG1 | 1 | M | 1 | 53.51 | 3.607 | 3.626 | 2.498 | 3.451 | -12.413 | ... | 1 | 0 | 71.773636 | 29.818182 | 1 | 0 | 0 | 0.0 | 0.0 | 0.0 |
| 1 | A0A024RBG1 | 1 | M | 2 | 62.17 | 3.375 | 4.515 | 5.852 | 4.549 | -10.070 | ... | 1 | 0 | 73.706667 | 32.500000 | 1 | 0 | 0 | 0.0 | 0.0 | 0.0 |
| 2 | A0A024RBG1 | 1 | K | 3 | 67.26 | 2.350 | 1.665 | 1.009 | 2.664 | -8.174 | ... | 1 | 0 | 75.262308 | 34.153846 | 1 | 0 | 0 | 0.0 | 0.0 | 0.0 |
3 rows × 36 columns
alphafold_ptms = annotate_ptm_data(alphafold_loops)
0%| | 0/7 [00:00<?, ?it/s]
ubi_lib_only_treated.csv
14%|████████████████▊ | 1/7 [00:25<02:33, 25.51s/it]
phosphositeplus_annotation.csv
29%|█████████████████████████████████▋ | 2/7 [00:53<02:13, 26.77s/it]
p_ochoa.csv
43%|██████████████████████████████████████████████████▌ | 3/7 [01:26<01:59, 29.77s/it]
ubi_lib_shared.csv
57%|███████████████████████████████████████████████████████████████████▍ | 4/7 [02:02<01:37, 32.36s/it]
ubi_lib_only_untreated.csv
71%|████████████████████████████████████████████████████████████████████████████████████▎ | 5/7 [02:47<01:13, 36.62s/it]
p_sugiyama.csv
86%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 6/7 [03:29<00:38, 38.58s/it]
p_stukalov.csv
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [04:15<00:00, 36.55s/it]
# Remove non-STY sites from Ochoa et al.
for col_name in ['functional_score',
'p_functional_0',
'p_functional_5',
'p_functional_6',
'p_functional_7',
'p_functional_8',
'p_functional_9']:
alphafold_ptms[col_name] = np.where(alphafold_ptms['AA'].isin(['S','T','Y']), alphafold_ptms[col_name], 0)
print(np.unique(alphafold_ptms[alphafold_ptms.functional_score>0].AA.values, return_counts=True))
print(np.unique(alphafold_ptms[alphafold_ptms.p_functional_9>0].AA.values, return_counts=True))
(array(['S', 'T', 'Y'], dtype=object), array([78270, 21986, 5918])) (array(['S', 'T', 'Y'], dtype=object), array([224, 68, 118]))
# Annotate functional subsets of Sugiyama et al. phosphosites
alphafold_ptms['p_sugiyama_psp'] = np.where((alphafold_ptms['p_sugiyama']==1) & (alphafold_ptms['p']==1), 1, 0)
alphafold_ptms['p_sugiyama_ochoa'] = np.where((alphafold_ptms['p_sugiyama']==1) & (alphafold_ptms['p_functional_0']==1), 1, 0)
alphafold_ptms['p_sugiyama_stukalov'] = np.where((alphafold_ptms['p_sugiyama']==1) & (alphafold_ptms['p_stukalov']==1), 1, 0)
alphafold_ptms[0:3]
| protein_id | protein_number | AA | position | quality | x_coord_c | x_coord_ca | x_coord_cb | x_coord_n | y_coord_c | ... | p_functional_7 | p_functional_8 | p_functional_9 | ub_shared | ub_untreated_only | p_sugiyama | p_stukalov | p_sugiyama_psp | p_sugiyama_ochoa | p_sugiyama_stukalov | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A0A024RBG1 | 1 | M | 1 | 53.51 | 3.607 | 3.626 | 2.498 | 3.451 | -12.413 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 1 | A0A024RBG1 | 1 | M | 2 | 62.17 | 3.375 | 4.515 | 5.852 | 4.549 | -10.070 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 2 | A0A024RBG1 | 1 | K | 3 | 67.26 | 2.350 | 1.665 | 1.009 | 2.664 | -8.174 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
3 rows × 64 columns
Kinases with structural annotation that have an alphafold structure
len(alphafold_ptms[(alphafold_ptms.gloop==1) | (alphafold_ptms.aloop==1) | (alphafold_ptms.achelix==1)].protein_id.unique())
365
Gloops
len(alphafold_ptms[(alphafold_ptms.gloop==1)].protein_id.unique())
171
len(alphafold_ptms[(alphafold_ptms.gloop==1) & (alphafold_ptms.flexible_pattern==1)].protein_id.unique())
0
len(alphafold_ptms[(alphafold_ptms.gloop==1) & (alphafold_ptms.flexible_pattern_extended_5==1)].protein_id.unique())
0
Calpha-Helix
len(alphafold_ptms[(alphafold_ptms.achelix==1)].protein_id.unique())
230
len(alphafold_ptms[(alphafold_ptms.achelix==1) & (alphafold_ptms.flexible_pattern==1)].protein_id.unique())
0
len(alphafold_ptms[(alphafold_ptms.achelix==1) & (alphafold_ptms.flexible_pattern_extended_5==1)].protein_id.unique())
0
Aloops
len(alphafold_ptms[(alphafold_ptms.aloop==1)].protein_id.unique())
309
len(alphafold_ptms[(alphafold_ptms.aloop==1) & (alphafold_ptms.flexible_pattern==1)].protein_id.unique())
72
len(alphafold_ptms[(alphafold_ptms.aloop==1) & (alphafold_ptms.flexible_pattern_extended_5==1)].protein_id.unique())
79
print('Proteins with overlap between Aloop and extended short IDR with regulatory phosphosites:')
len(alphafold_ptms[(alphafold_ptms.aloop==1) & (alphafold_ptms.flexible_pattern_extended_5==1) & ((alphafold_ptms.p_reg==1) | (alphafold_ptms.p_functional_5==1))].protein_id.unique())
Proteins with overlap between Aloop and extended short IDR with regulatory phosphosites:
55
print('Number of regulatory phosphosites in the overlap:')
alphafold_ptms[(alphafold_ptms.aloop==1) & (alphafold_ptms.flexible_pattern_extended_5==1) & ((alphafold_ptms.p_reg==1) | (alphafold_ptms.p_functional_5==1))].shape[0]
Number of regulatory phosphosites in the overlap:
99
print('Number of phosphosites in the overlap:')
alphafold_ptms[(alphafold_ptms.aloop==1) & (alphafold_ptms.flexible_pattern_extended_5==1) & ((alphafold_ptms.p==1) | (alphafold_ptms.p_functional_0==1))].shape[0]
Number of phosphosites in the overlap:
191
potentially_interesting_sites = alphafold_ptms[(alphafold_ptms.flexible_pattern_extended_5==1) &
((alphafold_ptms.p==1) | (alphafold_ptms.p_functional_0==1) |
(alphafold_ptms.ub==1) | (alphafold_ptms.ac==1) |
(alphafold_ptms.ga==1) | (alphafold_ptms.gl==1) |
(alphafold_ptms.m==1) | (alphafold_ptms.sm==1))][[
'protein_id','AA','position','IDR', 'flexible_pattern','flexible_pattern_extended_5','aloop',
'p','p_reg','functional_score',
'ub','ub_reg','ac','ac_reg','m','m_reg','sm','sm_reg','gl','gl_reg','ga']].reset_index(drop=True)
potentially_interesting_sites[0:3]
| protein_id | AA | position | IDR | flexible_pattern | flexible_pattern_extended_5 | aloop | p | p_reg | functional_score | ... | ub_reg | ac | ac_reg | m | m_reg | sm | sm_reg | gl | gl_reg | ga | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A0AVT1 | K | 810 | 1 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | A1A4S6 | K | 370 | 0 | 0 | 1 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | A1A4S6 | S | 374 | 1 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.110353 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 rows × 21 columns
potentially_interesting_sites.to_csv('data/mods_in_short_idrs/potentially_interesting_sites.tsv',
sep='\t', index=False)
short_idr_info = extract_short_IDR_info(alphafold_ptms)
short_idr_info[0:4]
| protein_id | IDR_start | IDR_end | sequence | |
|---|---|---|---|---|
| 0 | A0A087X1C5 | 332 | 345 | RVSPGCPIVGTHVC |
| 1 | A0A0G2JMD5 | 242 | 271 | GYERELYVSVQWPCIPDLDSPFLCLYYPQM |
| 2 | A0A0U1RQE8 | 142 | 168 | YATNKSKLGSWAETGHPDDELESETPN |
| 3 | A0AVI2 | 296 | 318 | QALIDQKLLYGTDDTDIQIFKSA |
short_idr_info.to_csv('data/mods_in_short_idrs/short_idr_info.tsv',
sep='\t', index=False)
plot_shortIDR_activationLoop(df=alphafold_ptms, kinase='O43353')
plot_shortIDR_activationLoop(df=alphafold_ptms, kinase='P24941')
plot_shortIDR_activationLoop(df=alphafold_ptms, kinase='Q92918')
plot_shortIDR_activationLoop(df=alphafold_ptms, kinase='P45984')
plot_shortIDR_activationLoop(df=alphafold_ptms, kinase='P28482')
plot_shortIDR_activationLoop(df=alphafold_ptms, kinase='O96017')
plot_shortIDR_activationLoop(df=alphafold_ptms, kinase='P02730')
plot_shortIDR_activationLoop(df=alphafold_ptms, kinase='Q8NB16') # MLKL
plot_shortIDR_activationLoop(df=alphafold_ptms, kinase='Q13546') # RIPK1
plot_shortIDR_activationLoop(df=alphafold_ptms, kinase='P29320') # Ephrin type-A receptor 3 (EPHA3: P29320)
res = list()
for ptm_type in ['p','p_reg','p_functional_0','p_functional_5',
'ub','sm','ac','m','ga','gl']:
#'ub','ub_reg','ac','ac_reg','m','m_reg','sm','sm_reg','gl','gl_reg','ga']:
n_in_shortIDR = alphafold_ptms[(alphafold_ptms.flexible_pattern==1) & (alphafold_ptms[ptm_type]==1)].shape[0]
n_in_shortIDRextention = alphafold_ptms[(alphafold_ptms.flexible_pattern==0) & (alphafold_ptms.flexible_pattern_extended_5==1) & (alphafold_ptms[ptm_type]==1)].shape[0]
res.append([ptm_type, n_in_shortIDR, n_in_shortIDRextention])
res = pd.DataFrame(res, columns=['ptm','Short IDR','5 AA extension'])
res = pd.melt(res, id_vars=['ptm'], value_vars=['Short IDR','5 AA extension'])
res['is_p'] = np.where(res.ptm.isin(['p','p_reg','p_functional_0','p_functional_5']), 'phospho', 'other')
col_map = dict({'Short IDR':"#66985E",'5 AA extension':"#b2cbae"})
res.to_csv("data/mods_in_short_idrs/short_idr_site_summary.tsv", sep='\t', index=False)
fig = px.bar(res,x='ptm',y='value', color='variable', text='value',
facet_col='is_p', facet_col_spacing=0.1,
color_discrete_map=col_map,
barmode='group')
fig = fig.update_traces(textfont_size=8, textangle=0,
textposition="outside", #['inside', 'outside', 'auto', 'none']
cliponaxis=False)
fig = fig.update_yaxes(col=1, title='count')
fig = fig.update_yaxes(matches=None,showticklabels=True, col=2)
fig = fig.update_xaxes(matches=None, title='')
fig = fig.update_layout(template="simple_white",
width=800, height=350,
)
config={'toImageButtonOptions': {'format': 'svg', 'filename':'ptms_in_shortIDRs'}}
fig.show(config=config)
alphafold_ptms[(alphafold_ptms.flexible_pattern_extended_5==1) & (alphafold_ptms.p==1) & (alphafold_ptms.p_reg==0)].shape[0]
1437
alphafold_ptms[(alphafold_ptms.flexible_pattern_extended_5==1) & (alphafold_ptms.functional_score>0)].shape[0]
939
alphafold_ptms[(alphafold_ptms.flexible_pattern_extended_5==1) & (alphafold_ptms.functional_score>0.5)].shape[0]
144
alphafold_ptms[(alphafold_ptms.flexible_pattern_extended_5==1) & (alphafold_ptms.ub==1)].shape[0]
898
alphafold_ptms[(alphafold_ptms.flexible_pattern_extended_5==1) & (alphafold_ptms.ac==1)].shape[0]
118
alphafold_ptms[(alphafold_ptms.flexible_pattern_extended_5==1) & (alphafold_ptms.sm==1)].shape[0]
43
alphafold_ptms[(alphafold_ptms.flexible_pattern_extended_5==1) & (alphafold_ptms.m==1)].shape[0]
53
alphafold_ptms[(alphafold_ptms.flexible_pattern_extended_5==1) & (alphafold_ptms.ga==1)].shape[0]
33
alphafold_ptms[(alphafold_ptms.flexible_pattern_extended_5==1) & (alphafold_ptms.gl==1)].shape[0]
1
alphafold_ptms[(alphafold_ptms.flexible_pattern==1) & (alphafold_ptms.ub_reg==1)]
| protein_id | protein_number | AA | position | quality | x_coord_c | x_coord_ca | x_coord_cb | x_coord_n | y_coord_c | ... | p_functional_7 | p_functional_8 | p_functional_9 | ub_shared | ub_untreated_only | p_sugiyama | p_stukalov | p_sugiyama_psp | p_sugiyama_ochoa | p_sugiyama_stukalov | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1520723 | P00533 | 3032 | K | 867 | 52.88 | 0.145 | 0.776 | 1.245 | 1.904 | 23.701 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 1981532 | P13010 | 4190 | K | 439 | 90.88 | 14.595 | 14.113 | 15.283 | 13.406 | 32.897 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 1981536 | P13010 | 4190 | K | 443 | 90.00 | 7.535 | 8.478 | 8.233 | 9.898 | 37.059 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 4472419 | Q4U2R8 | 8942 | K | 315 | 62.93 | -21.612 | -22.841 | -23.135 | -24.054 | -1.698 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
4 rows × 64 columns
ubi_in_shortIDR_prots = list(alphafold_ptms[(alphafold_ptms.flexible_pattern==1) & (alphafold_ptms.ub_reg==1)].protein_id.unique())
def generate_ptm_site_dict(alphafold_df):
all_ptm_datasets = ['ub_treated_only', 'ac', 'ac_reg', 'ga', 'gl', 'gl_reg', 'm', 'm_reg',
'p', 'p_reg', 'sm', 'sm_reg', 'ub', 'ub_reg',
'p_functional_0', 'p_functional_5', 'p_functional_6', 'p_functional_7',
'p_functional_8', 'p_functional_9', 'p_stukalov',
'ub_shared', 'ub_untreated_only', 'p_sugiyama', 'p_sugiyama_psp',
'p_sugiyama_ochoa','p_sugiyama_stukalov']
ptm_dict = {}
for d in all_ptm_datasets:
df_d = alphafold_df[alphafold_df[d] == 1]
unique_aa = list(np.unique(df_d.AA.values))
ptm_dict.update({d: unique_aa})
return(ptm_dict)
ptm_site_dict = generate_ptm_site_dict(alphafold_ptms)
ptm_site_dict
{'ub_treated_only': ['K'],
'ac': ['K'],
'ac_reg': ['K'],
'ga': ['S', 'T'],
'gl': ['S', 'T'],
'gl_reg': ['S', 'T'],
'm': ['K', 'R'],
'm_reg': ['K', 'R'],
'p': ['S', 'T', 'Y'],
'p_reg': ['S', 'T', 'Y'],
'sm': ['K'],
'sm_reg': ['K'],
'ub': ['K'],
'ub_reg': ['K'],
'p_functional_0': ['S', 'T', 'Y'],
'p_functional_5': ['S', 'T', 'Y'],
'p_functional_6': ['S', 'T', 'Y'],
'p_functional_7': ['S', 'T', 'Y'],
'p_functional_8': ['S', 'T', 'Y'],
'p_functional_9': ['S', 'T', 'Y'],
'p_stukalov': ['S', 'T', 'Y'],
'ub_shared': ['K'],
'ub_untreated_only': ['K'],
'p_sugiyama': ['S', 'T', 'Y'],
'p_sugiyama_psp': ['S', 'T', 'Y'],
'p_sugiyama_ochoa': ['S', 'T', 'Y'],
'p_sugiyama_stukalov': ['S', 'T', 'Y']}
enrichment_res = perform_enrichment_analysis(
df=alphafold_ptms,
ptm_types=list(ptm_site_dict.keys()),
rois=['IDR'],
quality_cutoffs=[0],
ptm_site_dict=ptm_site_dict)
enrichment_res
| quality_cutoff | ptm | roi | n_aa_ptm | n_aa_roi | n_ptm_in_roi | n_ptm_not_in_roi | n_naked_in_roi | n_naked_not_in_roi | oddsr | p | p_adj_bf | p_adj_bh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ub_treated_only | IDR | 19517 | 256095 | 4213 | 15304 | 251882 | 330710 | 0.361440 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | ac | IDR | 21202 | 256095 | 8634 | 12568 | 247461 | 333446 | 0.925688 | 5.595594e-08 | 1.510810e-06 | 6.043242e-08 |
| 0 | 0 | ac_reg | IDR | 631 | 256095 | 379 | 252 | 255716 | 345762 | 2.033565 | 1.032691e-18 | 2.788265e-17 | 1.267393e-18 |
| 0 | 0 | ga | IDR | 1641 | 673154 | 1366 | 275 | 671788 | 740721 | 5.476971 | 1.020559e-197 | 2.755509e-196 | 2.296257e-197 |
| 0 | 0 | gl | IDR | 414 | 673154 | 345 | 69 | 672809 | 740927 | 5.506221 | 1.901862e-51 | 5.135028e-50 | 2.852793e-51 |
| 0 | 0 | gl_reg | IDR | 25 | 673154 | 19 | 6 | 673135 | 740990 | 3.485881 | 4.708172e-03 | 1.271206e-01 | 4.889255e-03 |
| 0 | 0 | m | IDR | 13866 | 515468 | 8277 | 5589 | 507191 | 677630 | 1.978609 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | m_reg | IDR | 176 | 515468 | 112 | 64 | 515356 | 683155 | 2.319797 | 4.338733e-08 | 1.171458e-06 | 4.881075e-08 |
| 0 | 0 | p | IDR | 214371 | 746685 | 134767 | 79604 | 611918 | 869146 | 2.404630 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | p_reg | IDR | 8658 | 746685 | 5960 | 2698 | 740725 | 946052 | 2.821385 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | sm | IDR | 8053 | 256095 | 4133 | 3920 | 251962 | 342094 | 1.431495 | 2.439505e-57 | 6.586664e-56 | 4.116665e-57 |
| 0 | 0 | sm_reg | IDR | 467 | 256095 | 310 | 157 | 255785 | 345857 | 2.669830 | 3.744639e-25 | 1.011053e-23 | 4.814536e-25 |
| 0 | 0 | ub | IDR | 91388 | 256095 | 29447 | 61941 | 226648 | 284073 | 0.595855 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | ub_reg | IDR | 451 | 256095 | 196 | 255 | 255899 | 345759 | 1.038534 | 7.032357e-01 | 1.000000e+00 | 7.032357e-01 |
| 0 | 0 | p_functional_0 | IDR | 106174 | 746685 | 89689 | 16485 | 656996 | 932265 | 7.720170 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | p_functional_5 | IDR | 10866 | 746685 | 8864 | 2002 | 737821 | 946748 | 5.681317 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | p_functional_6 | IDR | 4710 | 746685 | 3722 | 988 | 742963 | 947762 | 4.805643 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | p_functional_7 | IDR | 2118 | 746685 | 1586 | 532 | 745099 | 948218 | 3.793899 | 9.259594e-183 | 2.500090e-181 | 1.785779e-182 |
| 0 | 0 | p_functional_8 | IDR | 968 | 746685 | 678 | 290 | 746007 | 948460 | 2.972404 | 7.261322e-60 | 1.960557e-58 | 1.307038e-59 |
| 0 | 0 | p_functional_9 | IDR | 410 | 746685 | 290 | 120 | 746395 | 948630 | 3.071460 | 1.052608e-27 | 2.842041e-26 | 1.421021e-27 |
| 0 | 0 | p_stukalov | IDR | 15052 | 746685 | 13632 | 1420 | 733053 | 947330 | 12.406153 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | ub_shared | IDR | 11741 | 256095 | 3449 | 8292 | 252646 | 337722 | 0.556008 | 6.847108e-194 | 1.848719e-192 | 1.422092e-193 |
| 0 | 0 | ub_untreated_only | IDR | 6321 | 256095 | 3157 | 3164 | 252938 | 342850 | 1.352472 | 9.990211e-33 | 2.697357e-31 | 1.419662e-32 |
| 0 | 0 | p_sugiyama | IDR | 19338 | 746685 | 8131 | 11207 | 738554 | 937543 | 0.921008 | 1.863623e-08 | 5.031783e-07 | 2.187732e-08 |
| 0 | 0 | p_sugiyama_psp | IDR | 12170 | 746685 | 6198 | 5972 | 740487 | 942778 | 1.321368 | 7.834905e-53 | 2.115424e-51 | 1.244367e-52 |
| 0 | 0 | p_sugiyama_ochoa | IDR | 8312 | 746685 | 5138 | 3174 | 741547 | 945576 | 2.064168 | 4.078817e-233 | 1.101281e-231 | 1.001164e-232 |
| 0 | 0 | p_sugiyama_stukalov | IDR | 1742 | 746685 | 1429 | 313 | 745256 | 948437 | 5.810198 | 7.750902e-234 | 2.092744e-232 | 2.092744e-233 |
# Get % Ubi in IDRs
res_ub_t = enrichment_res[enrichment_res.ptm=="ub_treated_only"]
res_ub_t.n_ptm_not_in_roi.values/res_ub_t.n_aa_ptm.values
array([0.78413691])
# Get % Ubi in IDRs
res_ub_t = enrichment_res[enrichment_res.ptm=="ub_shared"]
res_ub_t.n_ptm_not_in_roi.values/res_ub_t.n_aa_ptm.values
array([0.70624308])
# Get % Ubi in IDRs
res_ub_t = enrichment_res[enrichment_res.ptm=="ub_untreated_only"]
res_ub_t.n_ptm_not_in_roi.values/res_ub_t.n_aa_ptm.values
array([0.50055371])
plot_enrichment(data=enrichment_res,
ptm_select=['p','p_reg','ub','ub_reg','sm','sm_reg','ac','ac_reg','m','m_reg','gl','gl_reg','ga'],
roi_select=['IDR'])
/opt/anaconda3/envs/structuremap_analysis/lib/python3.8/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log10
plot_enrichment(data=enrichment_res,
ptm_select=['ub','ub_shared','ub_treated_only','ub_untreated_only'],
roi_select=['IDR']
)
/opt/anaconda3/envs/structuremap_analysis/lib/python3.8/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log10
plot_enrichment(data=enrichment_res,
ptm_select=['p','p_reg',
'p_stukalov',
'p_sugiyama','p_sugiyama_psp','p_sugiyama_ochoa'],
roi_select=['IDR']
)
/opt/anaconda3/envs/structuremap_analysis/lib/python3.8/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log10
enrichment_res.to_csv("data/ptm_enrichment/ptm_enrichment_idrs.tsv", sep="\t", index=False)
enrichment_res = perform_enrichment_analysis(
df=alphafold_ptms[alphafold_ptms.IDR==0],
ptm_types=list(ptm_site_dict.keys()),
rois=['high_acc_5'],
quality_cutoffs=[0],
ptm_site_dict=ptm_site_dict)
plot_enrichment(data=enrichment_res,
ptm_select=['p','p_reg','ub','ub_reg','sm','sm_reg','ac','ac_reg','m','m_reg','gl','gl_reg','ga'],
roi_select=['high_acc_5'],
)
plot_enrichment(data=enrichment_res,
ptm_select=['ub','ub_shared','ub_treated_only','ub_untreated_only'],
roi_select=['high_acc_5'],
)
enrichment_res.to_csv("data/ptm_enrichment/ptm_enrichment_high_acc_5.tsv", sep="\t", index=False)
# test if enrichment of phosphosites in IDRs can be recovered for Sugiyama et al.
enrichment_p_inIDR_Sugiyama = perform_enrichment_analysis(
df=alphafold_ptms,
ptm_types=['p', 'p_reg', 'p_stukalov', 'p_sugiyama', 'p_sugiyama_psp', 'p_sugiyama_stukalov'],
rois=['IDR'],
ptm_site_dict = ptm_site_dict,
quality_cutoffs=[0])
# inspect
enrichment_p_inIDR_Sugiyama
| quality_cutoff | ptm | roi | n_aa_ptm | n_aa_roi | n_ptm_in_roi | n_ptm_not_in_roi | n_naked_in_roi | n_naked_not_in_roi | oddsr | p | p_adj_bf | p_adj_bh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | p | IDR | 214371 | 746685 | 134767 | 79604 | 611918 | 869146 | 2.404630 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | p_reg | IDR | 8658 | 746685 | 5960 | 2698 | 740725 | 946052 | 2.821385 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | p_stukalov | IDR | 15052 | 746685 | 13632 | 1420 | 733053 | 947330 | 12.406153 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | p_sugiyama | IDR | 19338 | 746685 | 8131 | 11207 | 738554 | 937543 | 0.921008 | 1.863623e-08 | 1.118174e-07 | 1.863623e-08 |
| 0 | 0 | p_sugiyama_psp | IDR | 12170 | 746685 | 6198 | 5972 | 740487 | 942778 | 1.321368 | 7.834905e-53 | 4.700943e-52 | 9.401886e-53 |
| 0 | 0 | p_sugiyama_stukalov | IDR | 1742 | 746685 | 1429 | 313 | 745256 | 948437 | 5.810198 | 7.750902e-234 | 4.650541e-233 | 1.162635e-233 |
plot_enrichment(data=enrichment_p_inIDR_Sugiyama,
ptm_select=['p', 'p_reg', 'p_stukalov', 'p_sugiyama', 'p_sugiyama_psp', 'p_sugiyama_stukalov'],
roi_select=['IDR'],
)
/opt/anaconda3/envs/structuremap_analysis/lib/python3.8/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log10
enrichment_p_inIDR_Sugiyama.to_csv("data/ptm_enrichment/ptm_enrichment_sugiyama.tsv", sep="\t", index=False)
enrichment_per_protein_res = perform_enrichment_analysis_per_protein(df=alphafold_ptms,
ptm_types=['ub'],
rois=['IDR'],
quality_cutoffs=[0],
ptm_site_dict=ptm_site_dict)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20053/20053 [02:26<00:00, 136.79it/s]
nonIDR_enriched_ubi = enrichment_per_protein_res[(enrichment_per_protein_res.p_adj_bh <= 0.05) & (enrichment_per_protein_res.oddsr < 1)]
annotation = pd.read_csv("data/human_fasta/uniprot-filtered-organism__Homo+sapiens+(Human)+[9606]_+AND+review--.tab", sep='\t')
annotation = annotation.rename(columns={"Entry name":"Uniprot ID", "Entry":"protein_id"})
nonIDR_enriched_ubi = nonIDR_enriched_ubi.merge(annotation, how='left', on=['protein_id'])
nonIDR_enriched_ubi = nonIDR_enriched_ubi.fillna(0)
nonIDR_enriched_ubi[0:3]
| protein_id | quality_cutoff | ptm | roi | n_aa_ptm | n_aa_roi | n_ptm_in_roi | n_ptm_not_in_roi | n_naked_in_roi | n_naked_not_in_roi | oddsr | p | p_adj_bf | p_adj_bh | Uniprot ID | Gene names | Gene ontology (molecular function) | Gene ontology (biological process) | Gene ontology (cellular component) | Protein names | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | E9PAV3 | 0 | ub | IDR | 11 | 137 | 4 | 7 | 133 | 5 | 0.021482 | 8.215603e-07 | 0.004223 | 0.000469 | NACAM_HUMAN | NACA | DNA binding [GO:0003677]; unfolded protein bin... | protein targeting to membrane [GO:0006612] | cytoplasm [GO:0005737]; nascent polypeptide-as... | Nascent polypeptide-associated complex subunit... |
| 1 | O00567 | 0 | ub | IDR | 19 | 37 | 2 | 17 | 35 | 11 | 0.036975 | 1.200681e-06 | 0.006171 | 0.000617 | NOP56_HUMAN | NOP56 NOL5A | cadherin binding [GO:0045296]; histone methylt... | rRNA processing [GO:0006364] | box C/D RNP complex [GO:0031428]; cytoplasm [G... | Nucleolar protein 56 (Nucleolar protein 5A) |
| 2 | O15021 | 0 | ub | IDR | 4 | 142 | 0 | 4 | 142 | 25 | 0.000000 | 6.906406e-04 | 1.000000 | 0.042261 | MAST4_HUMAN | MAST4 KIAA0303 | ATP binding [GO:0005524]; magnesium ion bindin... | cytoskeleton organization [GO:0007010]; intrac... | cytoplasm [GO:0005737] | Microtubule-associated serine/threonine-protei... |
nonIDR_enriched_ubi.shape
(71, 20)
len([str(x) for x in nonIDR_enriched_ubi['Gene ontology (molecular function)'] if ('RNA' in str(x) or 'DNA' in str(x) or 'nucleic acid binding' in str(x))])
57
len([str(x) for x in nonIDR_enriched_ubi['Gene ontology (molecular function)'] if ('RNA' not in str(x) and 'DNA' not in str(x) and 'nucleic acid binding' not in str(x))]
)
14
# Selected example: Ras GTPase-activating protein-binding protein 2 (G3BP2)
nonIDR_enriched_ubi[nonIDR_enriched_ubi.protein_id=="Q9UN86"].iloc[:,:15]
| protein_id | quality_cutoff | ptm | roi | n_aa_ptm | n_aa_roi | n_ptm_in_roi | n_ptm_not_in_roi | n_naked_in_roi | n_naked_not_in_roi | oddsr | p | p_adj_bf | p_adj_bh | Uniprot ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 67 | Q9UN86 | 0 | ub | IDR | 9 | 10 | 0 | 9 | 10 | 3 | 0.0 | 0.000462 | 1.0 | 0.031689 | G3BP2_HUMAN |
# Selected example: SET8
nonIDR_enriched_ubi[nonIDR_enriched_ubi.protein_id=="Q9NQR1"].iloc[:,:15]
| protein_id | quality_cutoff | ptm | roi | n_aa_ptm | n_aa_roi | n_ptm_in_roi | n_ptm_not_in_roi | n_naked_in_roi | n_naked_not_in_roi | oddsr | p | p_adj_bf | p_adj_bh | Uniprot ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 57 | Q9NQR1 | 0 | ub | IDR | 10 | 24 | 1 | 9 | 23 | 4 | 0.019324 | 0.00005 | 0.257435 | 0.006279 | KMT5A_HUMAN |
# subset data for IDRs
# subset to all annotated phosphosites in PhosphoSitePlus
pattern_df_subset_for_enrichment = alphafold_ptms[alphafold_ptms.IDR == 1]
pattern_df_subset_for_enrichment = pattern_df_subset_for_enrichment[pattern_df_subset_for_enrichment.p == 1]
pattern_df_subset_for_enrichment = pattern_df_subset_for_enrichment.reset_index(drop=True)
# test if regulatory / functional phosphosites are enriched in short IDRs compared to all other IDRs
enrichment_regP_in_shortIDRs = perform_enrichment_analysis(
df=pattern_df_subset_for_enrichment,
ptm_types=['p_reg'],
rois=['flexible_pattern'],
ptm_site_dict = ptm_site_dict,
quality_cutoffs=[0])
# inspect
enrichment_regP_in_shortIDRs
| quality_cutoff | ptm | roi | n_aa_ptm | n_aa_roi | n_ptm_in_roi | n_ptm_not_in_roi | n_naked_in_roi | n_naked_not_in_roi | oddsr | p | p_adj_bf | p_adj_bh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | p_reg | flexible_pattern | 5960 | 946 | 64 | 5896 | 882 | 127925 | 1.574379 | 0.001066 | 0.001066 | 0.001066 |
plot_enrichment(
data=enrichment_regP_in_shortIDRs,
ptm_select=['p_reg'])
enrichment_regP_in_shortIDRs.to_csv("data/ptm_enrichment/enrichment_regP_in_shortIDRs.tsv", sep="\t", index=False)
# subset data for IDRs
# subset to all annotated phosphosites in PhosphoSitePlus
pattern_df_subset_for_enrichment = alphafold_ptms[alphafold_ptms.IDR == 1]
pattern_df_subset_for_enrichment = pattern_df_subset_for_enrichment[pattern_df_subset_for_enrichment.p_functional_0 == 1]
pattern_df_subset_for_enrichment = pattern_df_subset_for_enrichment.reset_index(drop=True)
# test if regulatory / functional phosphosites are enriched in
enrichment_regP_in_shortIDRs = perform_enrichment_analysis(
df=pattern_df_subset_for_enrichment,
ptm_types=['p_functional_5','p_functional_6','p_functional_7','p_functional_8','p_functional_9'],
rois=['flexible_pattern'],
ptm_site_dict = ptm_site_dict,
quality_cutoffs=[0])
# inspect
enrichment_regP_in_shortIDRs
| quality_cutoff | ptm | roi | n_aa_ptm | n_aa_roi | n_ptm_in_roi | n_ptm_not_in_roi | n_naked_in_roi | n_naked_not_in_roi | oddsr | p | p_adj_bf | p_adj_bh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | p_functional_5 | flexible_pattern | 8864 | 599 | 81 | 8783 | 518 | 80307 | 1.429769 | 3.850611e-03 | 1.925305e-02 | 3.850611e-03 |
| 0 | 0 | p_functional_6 | flexible_pattern | 3722 | 599 | 57 | 3665 | 542 | 85425 | 2.451244 | 1.149745e-08 | 5.748724e-08 | 1.742083e-08 |
| 0 | 0 | p_functional_7 | flexible_pattern | 1586 | 599 | 45 | 1541 | 554 | 87549 | 4.614783 | 9.108554e-16 | 4.554277e-15 | 2.277138e-15 |
| 0 | 0 | p_functional_8 | flexible_pattern | 678 | 599 | 30 | 648 | 569 | 88442 | 7.196023 | 8.254186e-16 | 4.127093e-15 | 2.277138e-15 |
| 0 | 0 | p_functional_9 | flexible_pattern | 290 | 599 | 14 | 276 | 585 | 88814 | 7.700954 | 1.393666e-08 | 6.968331e-08 | 1.742083e-08 |
plot_enrichment(
data=enrichment_regP_in_shortIDRs,
ptm_select=['p_functional_5','p_functional_6','p_functional_7','p_functional_8','p_functional_9'])
enrichment_regP_in_shortIDRs.to_csv("data/ptm_enrichment/enrichment_regP_ochoa_in_shortIDRs.tsv", sep="\t", index=False)
kinase_motifs = pd.read_csv('data/kinase_substructures/kinase_motifs.txt',
sep='\t',
names=['enzyme','start','sty','end'])
kinase_motifs = kinase_motifs.astype('str')
kinase_motifs[0:3]
| enzyme | start | sty | end | |
|---|---|---|---|---|
| 0 | Akt kinase substrate motif | R,X,R,X,X | ST | FL |
| 1 | Akt kinase substrate motif | R,X,R,X,X | ST | nan |
| 2 | Akt kinase substrate motif | G,R,A,R,T,ST | S | FAE |
kinase_motifs['start'] = kinase_motifs['start'].apply(string_to_motif)
kinase_motifs['sty'] = kinase_motifs['sty'].apply(string_to_motif)
kinase_motifs['end'] = kinase_motifs['end'].apply(string_to_motif)
kinase_motifs['motif'] = kinase_motifs['start']+kinase_motifs['sty']+kinase_motifs['end']
kinase_motifs['mod_pos'] = [s.count(']') for s in kinase_motifs['start']]
kinase_motifs[0:3]
| enzyme | start | sty | end | motif | mod_pos | |
|---|---|---|---|---|---|---|
| 0 | Akt kinase substrate motif | [R][A-Z][R][A-Z][A-Z] | [ST] | [FL] | [R][A-Z][R][A-Z][A-Z][ST][FL] | 5 |
| 1 | Akt kinase substrate motif | [R][A-Z][R][A-Z][A-Z] | [ST] | [R][A-Z][R][A-Z][A-Z][ST] | 5 | |
| 2 | Akt kinase substrate motif | [G][R][A][R][T][ST] | [S] | [FAE] | [G][R][A][R][T][ST][S][FAE] | 6 |
from structuremap.processing import extract_motifs_in_proteome
kinase_motif_res = extract_motifs_in_proteome(alphafold_df=alphafold_ptms, motif_df=kinase_motifs)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20053/20053 [05:22<00:00, 62.11it/s]
# Format observed kinase motifs for merging with alphafold_ptms
kinase_motif_res['kinase_motif'] = 1
kinase_motif_res_sub = kinase_motif_res[['protein_id','position','AA','kinase_motif']].drop_duplicates()
alphafold_motifs = alphafold_ptms.merge(kinase_motif_res_sub, how='left', on=['protein_id','position','AA'])
alphafold_motifs = alphafold_motifs.fillna(0)
# alphafold_motifs[0:3]
# test if phosphosites are enriched in kinase motifs
enrichment_p_inMotif = perform_enrichment_analysis(
df=alphafold_motifs,
ptm_types=['p', 'p_reg', 'p_stukalov'],
rois=['kinase_motif'],
ptm_site_dict=ptm_site_dict,
quality_cutoffs=[0])
# inspect
enrichment_p_inMotif
| quality_cutoff | ptm | roi | n_aa_ptm | n_aa_roi | n_ptm_in_roi | n_ptm_not_in_roi | n_naked_in_roi | n_naked_not_in_roi | oddsr | p | p_adj_bf | p_adj_bh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | p | kinase_motif | 214371 | 1383958 | 188583 | 25788 | 1195375 | 285689 | 1.747730 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | p_reg | kinase_motif | 8658 | 1383958 | 8099 | 559 | 1375859 | 310918 | 3.274097 | 2.534710e-228 | 7.604131e-228 | 2.534710e-228 |
| 0 | 0 | p_stukalov | kinase_motif | 15052 | 1383958 | 14187 | 865 | 1369771 | 310612 | 3.719159 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
plot_enrichment(data=enrichment_p_inMotif,
ptm_select=['p', 'p_reg', 'p_stukalov'],
roi_select=['kinase_motif']
)
/opt/anaconda3/envs/structuremap_analysis/lib/python3.8/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log10
enrichment_p_inMotif.to_csv("data/ptm_enrichment/enrichment_p_inMotif.tsv", sep="\t", index=False)
# test if phosphosites in motifs are enriched in IDRs
enrichment_p_inMotif_inIDR = perform_enrichment_analysis(
df=alphafold_motifs[alphafold_motifs.kinase_motif==1],
ptm_types=['p', 'p_reg', 'p_stukalov'],
rois=['IDR'],
ptm_site_dict=ptm_site_dict,
quality_cutoffs=[0])
# inspect
enrichment_p_inMotif_inIDR
| quality_cutoff | ptm | roi | n_aa_ptm | n_aa_roi | n_ptm_in_roi | n_ptm_not_in_roi | n_naked_in_roi | n_naked_not_in_roi | oddsr | p | p_adj_bf | p_adj_bh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | p | IDR | 188583 | 635982 | 122170 | 66413 | 513812 | 681563 | 2.440132 | 0.0 | 0.0 | 0.0 |
| 0 | 0 | p_reg | IDR | 8099 | 635982 | 5653 | 2446 | 630329 | 745530 | 2.733508 | 0.0 | 0.0 | 0.0 |
| 0 | 0 | p_stukalov | IDR | 14187 | 635982 | 12884 | 1303 | 623098 | 746673 | 11.848964 | 0.0 | 0.0 | 0.0 |
plot_enrichment(data=enrichment_p_inMotif_inIDR,
ptm_select=['p', 'p_reg', 'p_stukalov'],
roi_select=['IDR']
)
/opt/anaconda3/envs/structuremap_analysis/lib/python3.8/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log10
enrichment_p_inMotif_inIDR.to_csv("data/ptm_enrichment/enrichment_p_inMotif_inIDR.tsv", sep="\t", index=False)
# test if phosphosites in motifs are enriched in highAcc
enrichment_p_inMotif_inHighAcc = perform_enrichment_analysis(
df=alphafold_motifs[alphafold_motifs.kinase_motif==1],
ptm_types=['p', 'p_reg', 'p_stukalov'],
rois=['high_acc_5'],
ptm_site_dict=ptm_site_dict,
quality_cutoffs=[0])
# inspect
enrichment_p_inMotif_inHighAcc
| quality_cutoff | ptm | roi | n_aa_ptm | n_aa_roi | n_ptm_in_roi | n_ptm_not_in_roi | n_naked_in_roi | n_naked_not_in_roi | oddsr | p | p_adj_bf | p_adj_bh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | p | high_acc_5 | 188583 | 1168938 | 170748 | 17835 | 998190 | 197185 | 1.891225 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 0 | 0 | p_reg | high_acc_5 | 8099 | 1168938 | 7428 | 671 | 1161510 | 214349 | 2.042904 | 7.251563e-85 | 2.175469e-84 | 7.251563e-85 |
| 0 | 0 | p_stukalov | high_acc_5 | 14187 | 1168938 | 14036 | 151 | 1154902 | 214869 | 17.293984 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
plot_enrichment(data=enrichment_p_inMotif_inHighAcc,
ptm_select=['p', 'p_reg', 'p_stukalov'],
roi_select=['high_acc_5']
)
/opt/anaconda3/envs/structuremap_analysis/lib/python3.8/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log10
enrichment_p_inMotif_inHighAcc.to_csv("data/ptm_enrichment/enrichment_p_inMotif_inHighAcc.tsv", sep="\t", index=False)
kinase_substrates = pd.read_csv(
'data/kinase_substructures/kinase_substrates_Sugiyama.tsv',
sep='\t')
kinase_substrates[0:3]
| Type | Kinase | Uniprot ID | Protein description | Position | |
|---|---|---|---|---|---|
| 0 | TK | ABL1 | 1433B_HUMAN | 14-3-3 protein beta/alpha | S212 |
| 1 | TK | ABL1 | 1433B_HUMAN | 14-3-3 protein beta/alpha | Y151 |
| 2 | TK | ABL1 | 1433B_HUMAN | 14-3-3 protein beta/alpha | Y21 |
kinase_substrates['AA'] = [i[0] for i in kinase_substrates['Position']]
kinase_substrates['position'] = [int(i[1:]) for i in kinase_substrates['Position']]
kinase_substrates = kinase_substrates.merge(annotation, how='left', on=['Uniprot ID'])
kinase_substrates = kinase_substrates[["protein_id","position","AA","Kinase"]].drop_duplicates().reset_index(drop=True)
kinase_substrates[0:3]
| protein_id | position | AA | Kinase | |
|---|---|---|---|---|
| 0 | P31946 | 212 | S | ABL1 |
| 1 | P31946 | 151 | Y | ABL1 |
| 2 | P31946 | 21 | Y | ABL1 |
#Akt1
substrates_AKT1, in_AKT1, out_AKT1, sub_in_AKT1 =get_kinase_substrates(
alphafold_df=alphafold_ptms,
kinase_df=kinase_substrates,
kinase="AKT1",
max_pPSE=5,
name="data/kinase_substructures/motif/motif")
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 296/296 [04:37<00:00, 1.07it/s]
Detected: 216 Missed: 26 Too short: 0 pPSE cutoff: 54
#JNK1
substrates_JNK1, in_JNK1, out_JNK1, sub_in_JNK1 =get_kinase_substrates(
alphafold_df=alphafold_ptms,
kinase_df=kinase_substrates,
kinase="JNK1",
max_pPSE=5,
name="data/kinase_substructures/motif/motif")
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 794/794 [12:28<00:00, 1.06it/s]
Detected: 609 Missed: 59 Too short: 7 pPSE cutoff: 119
#AurC
substrates_AurC, in_AurC, out_AurC, sub_in_AurC =get_kinase_substrates(
alphafold_df=alphafold_ptms,
kinase_df=kinase_substrates,
kinase="AurC",
max_pPSE=5,
name="data/kinase_substructures/motif/motif")
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 663/663 [10:21<00:00, 1.07it/s]
Detected: 470 Missed: 46 Too short: 6 pPSE cutoff: 141
# MAPKAPK2
substrates_MAPKAPK2, in_MAPKAPK2, out_MAPKAPK2, sub_in_MAPKAPK2 =get_kinase_substrates(
alphafold_df=alphafold_ptms,
kinase_df=kinase_substrates,
kinase="MAPKAPK2",
max_pPSE=5,
name="data/kinase_substructures/motif/motif")
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 395/395 [06:08<00:00, 1.07it/s]
Detected: 285 Missed: 42 Too short: 6 pPSE cutoff: 62
alphafold_ptms_noIDRs = alphafold_ptms[(alphafold_ptms.IDR==0) | (alphafold_ptms.flexible_pattern==1)]
alphafold_ptms_onlyIDRs = alphafold_ptms[(alphafold_ptms.IDR==1) & (alphafold_ptms.flexible_pattern==0)]
self_colocalization = evaluate_ptm_colocalization(
df=alphafold_ptms_noIDRs,
ptm_target='self',
ptm_types=['p','ub','sm','ac','m','gl','ga'],
ptm_dict=ptm_site_dict,
pae_dir=pae_dir,
min_dist = 1,
max_dist = 35,
dist_step = 5)
self_colocalization.to_csv('data/proximity_analysis/Fraction_of_modified_acceptor_residues_self_noIDRs.csv', index=False)
# plot_ptm_colocalization(self_colocalization)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [07:42<00:00, 37.81it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [07:20<00:00, 39.77it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:40<00:00, 51.44it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:44<00:00, 50.73it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:33<00:00, 52.40it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:22<00:00, 54.33it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:19<00:00, 54.85it/s]
plot_ptm_colocalization(self_colocalization, context="3D", plot_width=1500)
p_colocalization = evaluate_ptm_colocalization(
df=alphafold_ptms_noIDRs,
ptm_target='p',
ptm_types=['ub','sm','ac','m','gl','ga'],
ptm_dict=ptm_site_dict,
pae_dir=pae_dir,
min_dist = 0,
max_dist = 35,
dist_step = 5)
p_colocalization.to_csv('data/proximity_analysis/Fraction_of_modified_acceptor_residues_p_colocalization_noIDRs.csv', index=False)
# plot_ptm_colocalization(p_colocalization)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [07:04<00:00, 41.22it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:47<00:00, 50.37it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:48<00:00, 50.21it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:39<00:00, 51.54it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:30<00:00, 52.93it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:26<00:00, 53.64it/s]
plot_ptm_colocalization(p_colocalization, context="3D", plot_width=1500)
ub_colocalization = evaluate_ptm_colocalization(
df=alphafold_ptms_noIDRs,
ptm_target='ub',
ptm_types=['p','sm','ac','m','gl','ga'],
ptm_dict=ptm_site_dict,
pae_dir=pae_dir,
min_dist = 0,
max_dist = 35,
dist_step = 5)
ub_colocalization.to_csv('data/proximity_analysis/Fraction_of_modified_acceptor_residues_ub_colocalization_noIDRs.csv', index=False)
# plot_ptm_colocalization(ub_colocalization)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [06:23<00:00, 45.60it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:35<00:00, 52.13it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:37<00:00, 51.91it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:39<00:00, 51.57it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:29<00:00, 53.12it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [05:25<00:00, 53.71it/s]
plot_ptm_colocalization(ub_colocalization, context="3D", plot_width=1500)
%%time
proximity_res_pandub_mean_plus = get_proximity_pvals(
df=alphafold_ptms_noIDRs,
ptm_types = ['p','ub'],
ptm_site_dict = ptm_site_dict,
error_dir=pae_dir,
per_site_metric= 'mean',
error_operation='plus',
n_random=10000,
random_seed=44)
cluster_prots_pandub_df = proximity_res_pandub_mean_plus[(proximity_res_pandub_mean_plus.pvalue_3d_adj_bh <= 0.05) & (proximity_res_pandub_mean_plus.n_ptms > 3)]
cluster_prots_pandub = list(cluster_prots_pandub_df.protein_id)
cluster_prots_pandub_df
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [2:56:22<00:00, 1.65it/s]
CPU times: user 2h 51min 54s, sys: 4min 2s, total: 2h 55min 56s Wall time: 2h 56min 32s
| protein_id | ptm | n_ptms | pvalue_1d | pvalue_3d | pvalue_1d_adj_bh | pvalue_3d_adj_bh | |
|---|---|---|---|---|---|---|---|
| 222 | A6NNF4 | ub | 15.0 | 0.0002 | 0.0002 | 0.020061 | 0.033254 |
| 325 | E2RYF6 | p | 19.0 | 0.0001 | 0.0003 | 0.012387 | 0.041013 |
| 459 | O00429 | ub | 11.0 | 0.0000 | 0.0003 | 0.000000 | 0.041013 |
| 1203 | O43852 | p | 19.0 | 0.9066 | 0.0001 | 1.000000 | 0.020281 |
| 1439 | O60861 | p | 5.0 | 0.0007 | 0.0003 | 0.041013 | 0.041013 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 17634 | Q9UPU3 | p | 9.0 | 0.0000 | 0.0000 | 0.000000 | 0.000000 |
| 18052 | Q9Y473 | p | 7.0 | 0.0000 | 0.0001 | 0.000000 | 0.020281 |
| 18194 | Q9Y5F7 | p | 4.0 | 0.0000 | 0.0000 | 0.000000 | 0.000000 |
| 18198 | Q9Y5G2 | p | 5.0 | 0.0000 | 0.0002 | 0.000000 | 0.033254 |
| 18431 | Q9Y6U3 | p | 5.0 | 0.0012 | 0.0002 | 0.055646 | 0.033254 |
133 rows × 7 columns
proximity_res_pandub_mean_plus[proximity_res_pandub_mean_plus.protein_id=="P00533"]
| protein_id | ptm | n_ptms | pvalue_1d | pvalue_3d | pvalue_1d_adj_bh | pvalue_3d_adj_bh | |
|---|---|---|---|---|---|---|---|
| 2331 | P00533 | p | 26.0 | 0.7753 | 0.0914 | 0.961364 | 0.46949 |
| 2332 | P00533 | ub | 27.0 | 0.0000 | 0.0000 | 0.000000 | 0.00000 |
with open('data/proximity_analysis/proximity_pandub_sig.txt', 'w') as f:
for item in cluster_prots_pandub:
f.write("%s\n" % item)
with open('data/proximity_analysis/proximity_pandub_all.txt', 'w') as f:
for item in list(proximity_res_pandub_mean_plus.protein_id):
f.write("%s\n" % item)
david_pandub = pd.read_csv("data/proximity_analysis/proximity_pandub_david_gocc.txt", sep="\t")
plot_enrichment_david(david_pandub, fdr_threshold=0.01, fold_enrichment_threshold=1, count_threshold=10)
%%time
proximity_res_p_stukalov_mean_plus = get_proximity_pvals(df=alphafold_ptms_noIDRs,
ptm_types = ['p_stukalov'],
ptm_site_dict = ptm_site_dict,
error_dir=pae_dir,
per_site_metric= 'mean',
error_operation='plus',
n_random=10000,
random_seed=44)
cluster_prots_p_stukalov_df = proximity_res_p_stukalov_mean_plus[(proximity_res_p_stukalov_mean_plus.pvalue_3d_adj_bh <= 0.05) & (proximity_res_p_stukalov_mean_plus.n_ptms > 3)]
cluster_prots_p_stukalov = list(cluster_prots_p_stukalov_df.protein_id)
cluster_prots_p_stukalov_df
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17501/17501 [03:03<00:00, 95.14it/s]
CPU times: user 2min 52s, sys: 14.2 s, total: 3min 6s Wall time: 3min 13s
| protein_id | ptm | n_ptms | pvalue_1d | pvalue_3d | pvalue_1d_adj_bh | pvalue_3d_adj_bh | |
|---|---|---|---|---|---|---|---|
| 53 | P08559 | p_stukalov | 6.0 | 0.0022 | 0.0001 | 0.021806 | 0.004486 |
| 67 | P15121 | p_stukalov | 4.0 | 0.6339 | 0.0028 | 0.789860 | 0.033097 |
| 68 | P15531 | p_stukalov | 6.0 | 0.0011 | 0.0038 | 0.017270 | 0.033097 |
| 80 | P21333 | p_stukalov | 26.0 | 0.0030 | 0.0087 | 0.022429 | 0.043860 |
| 84 | P23443 | p_stukalov | 4.0 | 0.0000 | 0.0017 | 0.000000 | 0.029747 |
| 290 | Q9UF83 | p_stukalov | 5.0 | 0.0165 | 0.0009 | 0.046259 | 0.023550 |
cluster_prots_p_stukalov_df.to_csv("data/proximity_analysis/cluster_prots_p_stukalov_df.csv", sep='\t', index=False )
# check background
proximity_res_p_stukalov_mean_plus[proximity_res_p_stukalov_mean_plus.n_ptms > 3]
| protein_id | ptm | n_ptms | pvalue_1d | pvalue_3d | pvalue_1d_adj_bh | pvalue_3d_adj_bh | |
|---|---|---|---|---|---|---|---|
| 15 | O60218 | p_stukalov | 6.0 | 0.2170 | 0.1613 | 0.351874 | 0.297931 |
| 19 | O75369 | p_stukalov | 13.0 | 0.7180 | 0.4944 | 0.835007 | 0.710300 |
| 34 | P00352 | p_stukalov | 5.0 | 0.1811 | 0.4954 | 0.304093 | 0.710300 |
| 35 | P02545 | p_stukalov | 4.0 | 0.0549 | 0.7003 | 0.103225 | 0.852303 |
| 36 | P04075 | p_stukalov | 4.0 | 0.0024 | 0.4341 | 0.021806 | 0.674789 |
| 39 | P04406 | p_stukalov | 12.0 | 0.0017 | 0.0569 | 0.020531 | 0.129468 |
| 45 | P06400 | p_stukalov | 5.0 | 0.7464 | 0.9602 | 0.852253 | 1.000000 |
| 47 | P06733 | p_stukalov | 7.0 | 0.9893 | 0.3448 | 1.000000 | 0.555216 |
| 49 | P06748 | p_stukalov | 6.0 | 0.1383 | 0.2729 | 0.237302 | 0.466734 |
| 51 | P07355 | p_stukalov | 6.0 | 0.0004 | 0.1146 | 0.008373 | 0.224902 |
| 53 | P08559 | p_stukalov | 6.0 | 0.0022 | 0.0001 | 0.021806 | 0.004486 |
| 65 | P14618 | p_stukalov | 5.0 | 0.0292 | 0.1792 | 0.063230 | 0.325253 |
| 67 | P15121 | p_stukalov | 4.0 | 0.6339 | 0.0028 | 0.789860 | 0.033097 |
| 68 | P15531 | p_stukalov | 6.0 | 0.0011 | 0.0038 | 0.017270 | 0.033097 |
| 73 | P17612 | p_stukalov | 5.0 | 0.5696 | 0.2864 | 0.739068 | 0.480907 |
| 75 | P18206 | p_stukalov | 10.0 | 0.0043 | 0.8198 | 0.026475 | 0.939479 |
| 76 | P18621 | p_stukalov | 4.0 | 0.0001 | 0.2377 | 0.003489 | 0.419313 |
| 77 | P18669 | p_stukalov | 4.0 | 0.6725 | 0.1242 | 0.805973 | 0.239256 |
| 80 | P21333 | p_stukalov | 26.0 | 0.0030 | 0.0087 | 0.022429 | 0.043860 |
| 84 | P23443 | p_stukalov | 4.0 | 0.0000 | 0.0017 | 0.000000 | 0.029747 |
| 85 | P23528 | p_stukalov | 6.0 | 0.5120 | 0.9927 | 0.678346 | 1.000000 |
| 100 | P29401 | p_stukalov | 6.0 | 0.0431 | 0.4947 | 0.083027 | 0.710300 |
| 108 | P35222 | p_stukalov | 7.0 | 0.3126 | 0.5195 | 0.471906 | 0.738041 |
| 126 | P49327 | p_stukalov | 11.0 | 0.9117 | 0.4794 | 0.965154 | 0.710300 |
| 128 | P49589 | p_stukalov | 5.0 | 0.1890 | 0.2842 | 0.314498 | 0.479778 |
| 135 | P51812 | p_stukalov | 4.0 | 0.2676 | 0.4269 | 0.416553 | 0.670233 |
| 144 | P55072 | p_stukalov | 5.0 | 0.3696 | 0.6792 | 0.539299 | 0.839492 |
| 147 | P60709 | p_stukalov | 5.0 | 0.1072 | 0.7204 | 0.190174 | 0.863380 |
| 156 | P68133 | p_stukalov | 6.0 | 0.2693 | 0.7872 | 0.416553 | 0.918888 |
| 168 | Q02241 | p_stukalov | 6.0 | 0.9318 | 0.9365 | 0.974024 | 0.993449 |
| 172 | Q05655 | p_stukalov | 5.0 | 0.0227 | 0.0243 | 0.054833 | 0.076168 |
| 190 | Q14156 | p_stukalov | 4.0 | 0.4495 | 0.2621 | 0.621775 | 0.457219 |
| 191 | Q14160 | p_stukalov | 5.0 | 0.5120 | 0.4594 | 0.678346 | 0.704976 |
| 200 | Q15418 | p_stukalov | 4.0 | 0.0212 | 0.0293 | 0.054444 | 0.085937 |
| 213 | Q68CZ2 | p_stukalov | 5.0 | 0.4211 | 0.6750 | 0.590292 | 0.839492 |
| 222 | Q71U36 | p_stukalov | 5.0 | 0.3903 | 0.5800 | 0.559608 | 0.788398 |
| 224 | Q7KZF4 | p_stukalov | 6.0 | 0.3995 | 0.3210 | 0.567615 | 0.524969 |
| 231 | Q86XL3 | p_stukalov | 4.0 | 0.6382 | 0.4073 | 0.792074 | 0.642674 |
| 239 | Q8NCF5 | p_stukalov | 5.0 | 0.0440 | 0.0556 | 0.084244 | 0.128371 |
| 261 | Q9BSJ8 | p_stukalov | 4.0 | 0.6270 | 0.7090 | 0.784375 | 0.852973 |
| 268 | Q9BZH6 | p_stukalov | 5.0 | 0.0070 | 0.0215 | 0.033144 | 0.070245 |
| 278 | Q9H7D0 | p_stukalov | 5.0 | 0.1992 | 0.8101 | 0.327481 | 0.935189 |
| 286 | Q9NZM1 | p_stukalov | 4.0 | 0.6412 | 0.8161 | 0.792665 | 0.938664 |
| 290 | Q9UF83 | p_stukalov | 5.0 | 0.0165 | 0.0009 | 0.046259 | 0.023550 |
| 298 | Q9UPU5 | p_stukalov | 9.0 | 0.0234 | 0.3016 | 0.054833 | 0.495824 |
| 305 | Q9Y2X3 | p_stukalov | 4.0 | 0.6869 | 0.8416 | 0.813912 | 0.948593 |
| 308 | Q9Y490 | p_stukalov | 14.0 | 0.0232 | 0.2977 | 0.054833 | 0.495824 |
len(proximity_res_p_stukalov_mean_plus[proximity_res_p_stukalov_mean_plus.n_ptms > 3].protein_id.unique())
47
alphafold_ptms[(alphafold_ptms.p==1)|(alphafold_ptms.ub==1)|(alphafold_ptms.ac==1)|(alphafold_ptms.m==1)|(alphafold_ptms.sm==1)|(alphafold_ptms.gl==1)|(alphafold_ptms.ga==1)]
| protein_id | protein_number | AA | position | quality | x_coord_c | x_coord_ca | x_coord_cb | x_coord_n | y_coord_c | ... | p_functional_7 | p_functional_8 | p_functional_9 | ub_shared | ub_untreated_only | p_sugiyama | p_stukalov | p_sugiyama_psp | p_sugiyama_ochoa | p_sugiyama_stukalov | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | A0A024RBG1 | 1 | K | 5 | 66.25 | 0.771 | 0.089 | -0.609 | 1.101 | -6.530 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 127 | A0A024RBG1 | 1 | K | 128 | 96.82 | 3.170 | 2.726 | 2.212 | 1.694 | 1.420 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 133 | A0A024RBG1 | 1 | K | 134 | 92.31 | 6.423 | 6.635 | 5.314 | 7.121 | 3.808 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 142 | A0A024RBG1 | 1 | K | 143 | 93.44 | -2.258 | -0.755 | -0.418 | -0.220 | 15.167 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 5226 | A0A075B759 | 49 | S | 40 | 98.29 | -3.166 | -2.987 | -4.066 | -2.979 | 12.084 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10479343 | Q9Y6Z7 | 20047 | K | 100 | 60.30 | -52.521 | -52.600 | -52.804 | -53.753 | -6.635 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 10479346 | Q9Y6Z7 | 20047 | K | 103 | 59.79 | -50.510 | -50.731 | -51.321 | -51.676 | -3.346 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 10479384 | Q9Y6Z7 | 20047 | S | 141 | 95.78 | -23.695 | -25.219 | -25.725 | -25.678 | 5.220 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 10479398 | Q9Y6Z7 | 20047 | T | 155 | 89.59 | -8.902 | -8.443 | -6.933 | -8.781 | -6.120 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 10479444 | Q9Y6Z7 | 20047 | K | 201 | 94.01 | -11.861 | -11.563 | -12.201 | -10.121 | 1.648 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
334529 rows × 64 columns
from alphamap.sequenceplot import plot_3d_structuremap, visualize_structure_in_panel
formatted_data = format_for_3Dviz(df=alphafold_ptms, ptm_dataset="p_stukalov")
plot3D_html, js_path, cif_path = plot_3d_structuremap(
df=alphafold_ptms,
organism="Human",
protein="P15121", #"P08559",
ptm_type='p_stukalov')
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2570.04it/s]
2022-04-03 20:37:27> Valid proteins: 0 2022-04-03 20:37:27> Invalid proteins: 0 2022-04-03 20:37:27> Existing proteins: 1
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 195/195 [00:00<00:00, 1448.99it/s] 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 6374.32it/s]
2022-04-03 20:37:27> Valid proteins: 0 2022-04-03 20:37:27> Invalid proteins: 0 2022-04-03 20:37:27> Existing proteins: 1
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.64it/s] 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 39.18it/s] 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 324.66it/s]
visualize_structure_in_panel(plot3D_html, js_path, cif_path)
2022-04-03 20:37:27> Starting Bokeh server version 2.2.2 (running on Tornado 6.1) 2022-04-03 20:37:27> User authentication hooks NOT provided (default user enabled)
! pip freeze
alphamap==0.1.10 appnope @ file:///opt/concourse/worker/volumes/live/5f13e5b3-5355-4541-5fc3-f08850c73cf9/volume/appnope_1606859448618/work argon2-cffi @ file:///opt/conda/conda-bld/argon2-cffi_1645000214183/work argon2-cffi-bindings @ file:///opt/concourse/worker/volumes/live/0628405f-5e68-4591-7d86-fffe9a78a449/volume/argon2-cffi-bindings_1644569695448/work asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work attrs @ file:///opt/conda/conda-bld/attrs_1642510447205/work backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work bio==1.3.6 biopython==1.79 biothings-client==0.2.6 bleach @ file:///opt/conda/conda-bld/bleach_1641577558959/work bokeh==2.2.2 certifi==2021.10.8 cffi @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/croot-2p6q7e2r/cffi_1642701115063/work charset-normalizer==2.0.12 click==8.1.2 debugpy @ file:///opt/concourse/worker/volumes/live/e17abc9f-08a9-4dca-4e9b-e6d24bde47ce/volume/debugpy_1637091828617/work decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work entrypoints==0.3 executing @ file:///opt/conda/conda-bld/executing_1646925071911/work h5py==3.6.0 idna==3.3 importlib-metadata @ file:///opt/concourse/worker/volumes/live/9f82d2cd-acb0-490f-6640-deddcdde7174/volume/importlib-metadata_1648562428986/work ipykernel @ file:///opt/concourse/worker/volumes/live/675cc243-e35d-4e85-47e2-838adcdd7698/volume/ipykernel_1647000791373/work/dist/ipykernel-6.9.1-py3-none-any.whl ipython @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_croot-uzw2qxt6/ipython_1646949517112/work ipython-genutils @ file:///tmp/build/80754af9/ipython_genutils_1606773439826/work ipywidgets @ file:///tmp/build/80754af9/ipywidgets_1634143127070/work jedi @ file:///opt/concourse/worker/volumes/live/fb2e45f1-1dff-42fa-6cab-bd094a4c4372/volume/jedi_1644315264796/work Jinja2 @ file:///opt/conda/conda-bld/jinja2_1647436528585/work jsonschema @ file:///Users/ktietz/demo/mc3/conda-bld/jsonschema_1630511932244/work jupyter==1.0.0 jupyter-client @ file:///opt/conda/conda-bld/jupyter_client_1643638337975/work jupyter-console @ file:///opt/conda/conda-bld/jupyter_console_1647002188872/work jupyter-core @ file:///opt/concourse/worker/volumes/live/d74cfa15-eb85-4119-7aea-67f807ff4440/volume/jupyter_core_1646994469397/work jupyterlab-pygments @ file:///tmp/build/80754af9/jupyterlab_pygments_1601490720602/work jupyterlab-widgets @ file:///tmp/build/80754af9/jupyterlab_widgets_1609884341231/work kaleido==0.2.1 llvmlite==0.38.0 Markdown==3.3.6 MarkupSafe @ file:///opt/concourse/worker/volumes/live/c9141381-1dba-485b-7c96-99007bf7bcfd/volume/markupsafe_1621528150226/work matplotlib-inline @ file:///tmp/build/80754af9/matplotlib-inline_1628242447089/work mistune @ file:///opt/concourse/worker/volumes/live/95802d64-d39c-491b-74ce-b9326880ca54/volume/mistune_1594373201816/work mygene==3.2.2 nb-conda-kernels @ file:///opt/concourse/worker/volumes/live/82c137e0-23e6-475b-7bb1-19e036b0f8ea/volume/nb_conda_kernels_1606775964383/work nbclient @ file:///tmp/build/80754af9/nbclient_1645431659072/work nbconvert @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/croot-r4wh23j5/nbconvert_1641309205559/work nbformat @ file:///tmp/build/80754af9/nbformat_1617383369282/work nest-asyncio @ file:///tmp/build/80754af9/nest-asyncio_1613680548246/work notebook @ file:///opt/concourse/worker/volumes/live/04354d80-9f15-4db9-4bcd-195dc104f1c6/volume/notebook_1645002579570/work numba==0.55.1 numpy==1.19.2 packaging @ file:///tmp/build/80754af9/packaging_1637314298585/work pandas==1.4.0 pandocfilters @ file:///opt/conda/conda-bld/pandocfilters_1643405455980/work panel==0.10.3 param==1.10.0 parso @ file:///opt/conda/conda-bld/parso_1641458642106/work patsy==0.5.2 pdfrw==0.4 pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work Pillow==9.1.0 plotly==4.12.0 prometheus-client @ file:///opt/conda/conda-bld/prometheus_client_1643788673601/work prompt-toolkit @ file:///tmp/build/80754af9/prompt-toolkit_1633440160888/work psutil==5.8.0 ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work pyct==0.4.8 Pygments @ file:///opt/conda/conda-bld/pygments_1644249106324/work pyparsing @ file:///tmp/build/80754af9/pyparsing_1635766073266/work pyrsistent @ file:///opt/concourse/worker/volumes/live/ac52847c-5646-4583-7286-af47061006cb/volume/pyrsistent_1636111017177/work pyteomics==4.3.3 python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work pytz==2022.1 pyviz-comms==2.2.0 PyYAML==6.0 pyzmq @ file:///opt/concourse/worker/volumes/live/12b008ff-2acb-4e15-682d-fce50395a9d4/volume/pyzmq_1638434992289/work qtconsole @ file:///opt/conda/conda-bld/qtconsole_1643819126524/work QtPy @ file:///opt/conda/conda-bld/qtpy_1643087291789/work reportlab==3.5.59 requests==2.27.1 retrying==1.3.3 scipy==1.8.0 Send2Trash @ file:///tmp/build/80754af9/send2trash_1632406701022/work six @ file:///tmp/build/80754af9/six_1644875935023/work stack-data @ file:///opt/conda/conda-bld/stack_data_1646927590127/work statsmodels==0.13.2 structuremap==0.0.8 terminado @ file:///opt/concourse/worker/volumes/live/77a7a2fc-5569-4ff8-4f8c-22be111c7b5e/volume/terminado_1644322600385/work testpath @ file:///tmp/build/80754af9/testpath_1624638946665/work tornado @ file:///opt/concourse/worker/volumes/live/05341796-4198-4ded-4a9a-332fde3cdfd1/volume/tornado_1606942323372/work tqdm==4.63.1 traitlets @ file:///tmp/build/80754af9/traitlets_1636710298902/work typing_extensions @ file:///opt/conda/conda-bld/typing_extensions_1647553014482/work urllib3==1.26.9 wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work webencodings==0.5.1 widgetsnbextension @ file:///opt/concourse/worker/volumes/live/b79a6318-0a66-44c8-744e-87eba0d202e8/volume/widgetsnbextension_1645009378090/work zipp @ file:///opt/conda/conda-bld/zipp_1641824620731/work